library(tidyverse)
library(assertr)
library(readxl)
library(here)
library(janitor)
# Load the raw Boing Boing candy survey workbook for each year. The paths are
# resolved by here::here(), so they do not depend on the working directory.
y2015_candy_data <-
  here::here("raw_data/candy_ranking_data/boing-boing-candy-2015.xlsx") %>%
  read_xlsx()
y2016_candy_data <-
  here::here("raw_data/candy_ranking_data/boing-boing-candy-2016.xlsx") %>%
  read_xlsx()
y2017_candy_data <-
  here::here("raw_data/candy_ranking_data/boing-boing-candy-2017.xlsx") %>%
  read_xlsx()
New names:
* `` -> ...114
Whoa, that’s a bit more data. Let’s have a look. I also need to look into this message: “New names: * `` -> ...114”.
# Overview of the 2015 survey: one line per column with its type.
y2015_candy_data %>%
  glimpse()
Rows: 5,630
Columns: 124
$ Timestamp <dttm> …
$ `How old are you?` <chr> …
$ `Are you going actually going trick or treating yourself?` <chr> …
$ `[Butterfinger]` <chr> …
$ `[100 Grand Bar]` <chr> …
$ `[Anonymous brown globs that come in black and orange wrappers]` <chr> …
$ `[Any full-sized candy bar]` <chr> …
$ `[Black Jacks]` <chr> …
$ `[Bonkers]` <chr> …
$ `[Bottle Caps]` <chr> …
$ `[Box’o’ Raisins]` <chr> …
$ `[Brach products (not including candy corn)]` <chr> …
$ `[Bubble Gum]` <chr> …
$ `[Cadbury Creme Eggs]` <chr> …
$ `[Candy Corn]` <chr> …
$ `[Vials of pure high fructose corn syrup, for main-lining into your vein]` <chr> …
$ `[Candy that is clearly just the stuff given out for free at restaurants]` <chr> …
$ `[Cash, or other forms of legal tender]` <chr> …
$ `[Chiclets]` <chr> …
$ `[Caramellos]` <chr> …
$ `[Snickers]` <chr> …
$ `[Dark Chocolate Hershey]` <chr> …
$ `[Dental paraphenalia]` <chr> …
$ `[Dots]` <chr> …
$ `[Fuzzy Peaches]` <chr> …
$ `[Generic Brand Acetaminophen]` <chr> …
$ `[Glow sticks]` <chr> …
$ `[Broken glow stick]` <chr> …
$ `[Goo Goo Clusters]` <chr> …
$ `[Good N' Plenty]` <chr> …
$ `[Gum from baseball cards]` <chr> …
$ `[Gummy Bears straight up]` <chr> …
$ `[Creepy Religious comics/Chick Tracts]` <chr> …
$ `[Healthy Fruit]` <chr> …
$ `[Heath Bar]` <chr> …
$ `[Hershey’s Kissables]` <chr> …
$ `[Hershey’s Milk Chocolate]` <chr> …
$ `[Hugs (actual physical hugs)]` <chr> …
$ `[Jolly Rancher (bad flavor)]` <chr> …
$ `[Jolly Ranchers (good flavor)]` <chr> …
$ `[Kale smoothie]` <chr> …
$ `[Kinder Happy Hippo]` <chr> …
$ `[Kit Kat]` <chr> …
$ `[Hard Candy]` <chr> …
$ `[Lapel Pins]` <chr> …
$ `[LemonHeads]` <chr> …
$ `[Licorice]` <chr> …
$ `[Licorice (not black)]` <chr> …
$ `[Lindt Truffle]` <chr> …
$ `[Lollipops]` <chr> …
$ `[Mars]` <chr> …
$ `[Mary Janes]` <chr> …
$ `[Maynards]` <chr> …
$ `[Milk Duds]` <chr> …
$ `[LaffyTaffy]` <chr> …
$ `[Minibags of chips]` <chr> …
$ `[JoyJoy (Mit Iodine)]` <chr> …
$ `[Reggie Jackson Bar]` <chr> …
$ `[Pixy Stix]` <chr> …
$ `[Nerds]` <chr> …
$ `[Nestle Crunch]` <chr> …
$ `[Now'n'Laters]` <chr> …
$ `[Pencils]` <chr> …
$ `[Milky Way]` <chr> …
$ `[Reese’s Peanut Butter Cups]` <chr> …
$ `[Tolberone something or other]` <chr> …
$ `[Runts]` <chr> …
$ `[Junior Mints]` <chr> …
$ `[Senior Mints]` <chr> …
$ `[Mint Kisses]` <chr> …
$ `[Mint Juleps]` <chr> …
$ `[Mint Leaves]` <chr> …
$ `[Peanut M&M’s]` <chr> …
$ `[Regular M&Ms]` <chr> …
$ `[Mint M&Ms]` <chr> …
$ `[Ribbon candy]` <chr> …
$ `[Rolos]` <chr> …
$ `[Skittles]` <chr> …
$ `[Smarties (American)]` <chr> …
$ `[Smarties (Commonwealth)]` <chr> …
$ `[Chick-o-Sticks (we don’t know what that is)]` <chr> …
$ `[Spotted Dick]` <chr> …
$ `[Starburst]` <chr> …
$ `[Swedish Fish]` <chr> …
$ `[Sweetums]` <chr> …
$ `[Those odd marshmallow circus peanut things]` <chr> …
$ `[Three Musketeers]` <chr> …
$ `[Peterson Brand Sidewalk Chalk]` <chr> …
$ `[Peanut Butter Bars]` <chr> …
$ `[Peanut Butter Jars]` <chr> …
$ `[Trail Mix]` <chr> …
$ `[Twix]` <chr> …
$ `[Vicodin]` <chr> …
$ `[White Bread]` <chr> …
$ `[Whole Wheat anything]` <chr> …
$ `[York Peppermint Patties]` <chr> …
$ `Please leave any remarks or comments regarding your choices.` <chr> …
$ `Please list any items not included above that give you JOY.` <chr> …
$ `Please list any items not included above that give you DESPAIR.` <chr> …
$ `Guess the number of mints in my hand.` <chr> …
$ `Betty or Veronica?` <chr> …
$ `Check all that apply: "I cried tears of sadness at the end of ____________"` <chr> …
$ `"That dress* that went viral early this year - when I first saw it, it was ________"` <chr> …
$ `Fill in the blank: "Taylor Swift is a force for ___________"` <lgl> …
$ `What is your favourite font?` <chr> …
$ `If you squint really hard, the words "Intelligent Design" would look like.` <chr> …
$ `Fill in the blank: "Imitation is a form of ____________"` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [JK Rowling]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [JJ Abrams]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Beyoncé]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Bieber]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Kevin Bacon]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Francis Bacon (1561 - 1626)]` <chr> …
$ `[Sea-salt flavored stuff, probably chocolate, since this is the "it" flavor of the year]` <chr> …
$ `[Necco Wafers]` <chr> …
$ `Which day do you prefer, Friday or Sunday?` <chr> …
$ `Please estimate the degrees of separation you have from the following folks [Bruce Lee]` <lgl> …
$ `Please estimate the degrees of separation you have from the following folks [JK Rowling]` <lgl> …
$ `Please estimate the degrees of separation you have from the following folks [Malala Yousafzai]` <lgl> …
$ `Please estimate the degrees of separation you have from the following folks [Thom Yorke]` <lgl> …
$ `Please estimate the degrees of separation you have from the following folks [JJ Abrams]` <lgl> …
$ `Please estimate the degrees of separation you have from the following folks [Hillary Clinton]` <lgl> …
$ `Please estimate the degrees of separation you have from the following folks [Donald Trump]` <lgl> …
$ `Please estimate the degrees of separation you have from the following folks [Beyoncé Knowles]` <lgl> …
# Overview of the 2016 survey: one line per column with its type.
y2016_candy_data %>%
  glimpse()
Rows: 1,259
Columns: 123
$ Timestamp <dttm> …
$ `Are you going actually going trick or treating yourself?` <chr> …
$ `Your gender:` <chr> …
$ `How old are you?` <chr> …
$ `Which country do you live in?` <chr> …
$ `Which state, province, county do you live in?` <chr> …
$ `[100 Grand Bar]` <chr> …
$ `[Anonymous brown globs that come in black and orange wrappers]` <chr> …
$ `[Any full-sized candy bar]` <chr> …
$ `[Black Jacks]` <chr> …
$ `[Bonkers (the candy)]` <chr> …
$ `[Bonkers (the board game)]` <chr> …
$ `[Bottle Caps]` <chr> …
$ `[Box'o'Raisins]` <chr> …
$ `[Broken glow stick]` <chr> …
$ `[Butterfinger]` <chr> …
$ `[Cadbury Creme Eggs]` <chr> …
$ `[Candy Corn]` <chr> …
$ `[Candy that is clearly just the stuff given out for free at restaurants]` <chr> …
$ `[Caramellos]` <chr> …
$ `[Cash, or other forms of legal tender]` <chr> …
$ `[Chardonnay]` <chr> …
$ `[Chick-o-Sticks (we don’t know what that is)]` <chr> …
$ `[Chiclets]` <chr> …
$ `[Coffee Crisp]` <chr> …
$ `[Creepy Religious comics/Chick Tracts]` <chr> …
$ `[Dental paraphenalia]` <chr> …
$ `[Dots]` <chr> …
$ `[Dove Bars]` <chr> …
$ `[Fuzzy Peaches]` <chr> …
$ `[Generic Brand Acetaminophen]` <chr> …
$ `[Glow sticks]` <chr> …
$ `[Goo Goo Clusters]` <chr> …
$ `[Good N' Plenty]` <chr> …
$ `[Gum from baseball cards]` <chr> …
$ `[Gummy Bears straight up]` <chr> …
$ `[Hard Candy]` <chr> …
$ `[Healthy Fruit]` <chr> …
$ `[Heath Bar]` <chr> …
$ `[Hershey's Dark Chocolate]` <chr> …
$ `[Hershey’s Milk Chocolate]` <chr> …
$ `[Hershey's Kisses]` <chr> …
$ `[Hugs (actual physical hugs)]` <chr> …
$ `[Jolly Rancher (bad flavor)]` <chr> …
$ `[Jolly Ranchers (good flavor)]` <chr> …
$ `[JoyJoy (Mit Iodine!)]` <chr> …
$ `[Junior Mints]` <chr> …
$ `[Senior Mints]` <chr> …
$ `[Kale smoothie]` <chr> …
$ `[Kinder Happy Hippo]` <chr> …
$ `[Kit Kat]` <chr> …
$ `[LaffyTaffy]` <chr> …
$ `[LemonHeads]` <chr> …
$ `[Licorice (not black)]` <chr> …
$ `[Licorice (yes black)]` <chr> …
$ `[Lindt Truffle]` <chr> …
$ `[Lollipops]` <chr> …
$ `[Mars]` <chr> …
$ `[Mary Janes]` <chr> …
$ `[Maynards]` <chr> …
$ `[Mike and Ike]` <chr> …
$ `[Milk Duds]` <chr> …
$ `[Milky Way]` <chr> …
$ `[Regular M&Ms]` <chr> …
$ `[Peanut M&M’s]` <chr> …
$ `[Blue M&M's]` <chr> …
$ `[Red M&M's]` <chr> …
$ `[Third Party M&M's]` <chr> …
$ `[Minibags of chips]` <chr> …
$ `[Mint Kisses]` <chr> …
$ `[Mint Juleps]` <chr> …
$ `[Mr. Goodbar]` <chr> …
$ `[Necco Wafers]` <chr> …
$ `[Nerds]` <chr> …
$ `[Nestle Crunch]` <chr> …
$ `[Now'n'Laters]` <chr> …
$ `[Peeps]` <chr> …
$ `[Pencils]` <chr> …
$ `[Person of Interest Season 3 DVD Box Set (not including Disc 4 with hilarious outtakes)]` <chr> …
$ `[Pixy Stix]` <chr> …
$ `[Reese’s Peanut Butter Cups]` <chr> …
$ `[Reese's Pieces]` <chr> …
$ `[Reggie Jackson Bar]` <chr> …
$ `[Rolos]` <chr> …
$ `[Skittles]` <chr> …
$ `[Smarties (American)]` <chr> …
$ `[Smarties (Commonwealth)]` <chr> …
$ `[Snickers]` <chr> …
$ `[Sourpatch Kids (i.e. abominations of nature)]` <chr> …
$ `[Spotted Dick]` <chr> …
$ `[Starburst]` <chr> …
$ `[Sweet Tarts]` <chr> …
$ `[Swedish Fish]` <chr> …
$ `[Sweetums (a friend to diabetes)]` <chr> …
$ `[Tic Tacs]` <chr> …
$ `[Those odd marshmallow circus peanut things]` <chr> …
$ `[Three Musketeers]` <chr> …
$ `[Tolberone something or other]` <chr> …
$ `[Trail Mix]` <chr> …
$ `[Twix]` <chr> …
$ `[Vials of pure high fructose corn syrup, for main-lining into your vein]` <chr> …
$ `[Vicodin]` <chr> …
$ `[Whatchamacallit Bars]` <chr> …
$ `[White Bread]` <chr> …
$ `[Whole Wheat anything]` <chr> …
$ `[York Peppermint Patties]` <chr> …
$ `Please list any items not included above that give you JOY.` <chr> …
$ `Please list any items not included above that give you DESPAIR.` <chr> …
$ `Please leave any witty, snarky or thoughtful remarks or comments regarding your choices.` <chr> …
$ `Guess the number of mints in my hand.` <chr> …
$ `Betty or Veronica?` <chr> …
$ `"That dress* that went viral a few years back - when I first saw it, it was ________"` <chr> …
$ `What is your favourite font?` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [JK Rowling]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [JJ Abrams]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Beyoncé]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Bieber]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Kevin Bacon]` <chr> …
$ `Please estimate the degree(s) of separation you have from the following celebrities [Francis Bacon (1561 - 1626)]` <chr> …
$ `Which day do you prefer, Friday or Sunday?` <chr> …
$ `Do you eat apples the correct way, East to West (side to side) or do you eat them like a freak of nature, South to North (bottom to top)?` <chr> …
$ `When you see the above image of the 4 different websites, which one would you most likely check out (please be honest).` <chr> …
$ `[York Peppermint Patties] Ignore` <lgl> …
# Overview of the 2017 survey: one line per column with its type.
y2017_candy_data %>%
  glimpse()
Rows: 2,460
Columns: 120
$ `Internal ID` <dbl> …
$ `Q1: GOING OUT?` <chr> …
$ `Q2: GENDER` <chr> …
$ `Q3: AGE` <chr> …
$ `Q4: COUNTRY` <chr> …
$ `Q5: STATE, PROVINCE, COUNTY, ETC` <chr> …
$ `Q6 | 100 Grand Bar` <chr> …
$ `Q6 | Anonymous brown globs that come in black and orange wrappers\t(a.k.a. Mary Janes)` <chr> …
$ `Q6 | Any full-sized candy bar` <chr> …
$ `Q6 | Black Jacks` <chr> …
$ `Q6 | Bonkers (the candy)` <chr> …
$ `Q6 | Bonkers (the board game)` <chr> …
$ `Q6 | Bottle Caps` <chr> …
$ `Q6 | Box'o'Raisins` <chr> …
$ `Q6 | Broken glow stick` <chr> …
$ `Q6 | Butterfinger` <chr> …
$ `Q6 | Cadbury Creme Eggs` <chr> …
$ `Q6 | Candy Corn` <chr> …
$ `Q6 | Candy that is clearly just the stuff given out for free at restaurants` <chr> …
$ `Q6 | Caramellos` <chr> …
$ `Q6 | Cash, or other forms of legal tender` <chr> …
$ `Q6 | Chardonnay` <chr> …
$ `Q6 | Chick-o-Sticks (we don’t know what that is)` <chr> …
$ `Q6 | Chiclets` <chr> …
$ `Q6 | Coffee Crisp` <chr> …
$ `Q6 | Creepy Religious comics/Chick Tracts` <chr> …
$ `Q6 | Dental paraphenalia` <chr> …
$ `Q6 | Dots` <chr> …
$ `Q6 | Dove Bars` <chr> …
$ `Q6 | Fuzzy Peaches` <chr> …
$ `Q6 | Generic Brand Acetaminophen` <chr> …
$ `Q6 | Glow sticks` <chr> …
$ `Q6 | Goo Goo Clusters` <chr> …
$ `Q6 | Good N' Plenty` <chr> …
$ `Q6 | Gum from baseball cards` <chr> …
$ `Q6 | Gummy Bears straight up` <chr> …
$ `Q6 | Hard Candy` <chr> …
$ `Q6 | Healthy Fruit` <chr> …
$ `Q6 | Heath Bar` <chr> …
$ `Q6 | Hershey's Dark Chocolate` <chr> …
$ `Q6 | Hershey’s Milk Chocolate` <chr> …
$ `Q6 | Hershey's Kisses` <chr> …
$ `Q6 | Hugs (actual physical hugs)` <chr> …
$ `Q6 | Jolly Rancher (bad flavor)` <chr> …
$ `Q6 | Jolly Ranchers (good flavor)` <chr> …
$ `Q6 | JoyJoy (Mit Iodine!)` <chr> …
$ `Q6 | Junior Mints` <chr> …
$ `Q6 | Senior Mints` <chr> …
$ `Q6 | Kale smoothie` <chr> …
$ `Q6 | Kinder Happy Hippo` <chr> …
$ `Q6 | Kit Kat` <chr> …
$ `Q6 | LaffyTaffy` <chr> …
$ `Q6 | LemonHeads` <chr> …
$ `Q6 | Licorice (not black)` <chr> …
$ `Q6 | Licorice (yes black)` <chr> …
$ `Q6 | Lindt Truffle` <chr> …
$ `Q6 | Lollipops` <chr> …
$ `Q6 | Mars` <chr> …
$ `Q6 | Maynards` <chr> …
$ `Q6 | Mike and Ike` <chr> …
$ `Q6 | Milk Duds` <chr> …
$ `Q6 | Milky Way` <chr> …
$ `Q6 | Regular M&Ms` <chr> …
$ `Q6 | Peanut M&M’s` <chr> …
$ `Q6 | Blue M&M's` <chr> …
$ `Q6 | Red M&M's` <chr> …
$ `Q6 | Green Party M&M's` <chr> …
$ `Q6 | Independent M&M's` <chr> …
$ `Q6 | Abstained from M&M'ing.` <chr> …
$ `Q6 | Minibags of chips` <chr> …
$ `Q6 | Mint Kisses` <chr> …
$ `Q6 | Mint Juleps` <chr> …
$ `Q6 | Mr. Goodbar` <chr> …
$ `Q6 | Necco Wafers` <chr> …
$ `Q6 | Nerds` <chr> …
$ `Q6 | Nestle Crunch` <chr> …
$ `Q6 | Now'n'Laters` <chr> …
$ `Q6 | Peeps` <chr> …
$ `Q6 | Pencils` <chr> …
$ `Q6 | Pixy Stix` <chr> …
$ `Q6 | Real Housewives of Orange County Season 9 Blue-Ray` <chr> …
$ `Q6 | Reese’s Peanut Butter Cups` <chr> …
$ `Q6 | Reese's Pieces` <chr> …
$ `Q6 | Reggie Jackson Bar` <chr> …
$ `Q6 | Rolos` <chr> …
$ `Q6 | Sandwich-sized bags filled with BooBerry Crunch` <chr> …
$ `Q6 | Skittles` <chr> …
$ `Q6 | Smarties (American)` <chr> …
$ `Q6 | Smarties (Commonwealth)` <chr> …
$ `Q6 | Snickers` <chr> …
$ `Q6 | Sourpatch Kids (i.e. abominations of nature)` <chr> …
$ `Q6 | Spotted Dick` <chr> …
$ `Q6 | Starburst` <chr> …
$ `Q6 | Sweet Tarts` <chr> …
$ `Q6 | Swedish Fish` <chr> …
$ `Q6 | Sweetums (a friend to diabetes)` <chr> …
$ `Q6 | Take 5` <chr> …
$ `Q6 | Tic Tacs` <chr> …
$ `Q6 | Those odd marshmallow circus peanut things` <chr> …
$ `Q6 | Three Musketeers` <chr> …
$ `Q6 | Tolberone something or other` <chr> …
$ `Q6 | Trail Mix` <chr> …
$ `Q6 | Twix` <chr> …
$ `Q6 | Vials of pure high fructose corn syrup, for main-lining into your vein` <chr> …
$ `Q6 | Vicodin` <chr> …
$ `Q6 | Whatchamacallit Bars` <chr> …
$ `Q6 | White Bread` <chr> …
$ `Q6 | Whole Wheat anything` <chr> …
$ `Q6 | York Peppermint Patties` <chr> …
$ `Q7: JOY OTHER` <chr> …
$ `Q8: DESPAIR OTHER` <chr> …
$ `Q9: OTHER COMMENTS` <chr> …
$ `Q10: DRESS` <chr> …
$ ...114 <chr> …
$ `Q11: DAY` <chr> …
$ `Q12: MEDIA [Daily Dish]` <dbl> …
$ `Q12: MEDIA [Science]` <dbl> …
$ `Q12: MEDIA [ESPN]` <dbl> …
$ `Q12: MEDIA [Yahoo]` <dbl> …
$ `Click Coordinates (x, y)` <chr> …
OK, so first of all, the message above refers to the fact that some columns have no names in the data, so R assigned names to them during the read. It looks like it refers specifically to column 114 of the 2017 data, so I should have a look at this.
First thoughts:
Hints from notes:
After reading the extra info about the data:
And after reading the questions:
# List the raw column names of each year's survey for side-by-side comparison.
y2015_candy_data %>%
  names()
y2016_candy_data %>%
  names()
y2017_candy_data %>%
  names()
After reviewing the notes, the following other ideas come to mind:
Should we just use janitor for this one in order to at least get the column names into better order?
We need to check on the amount of NA’s, work out what type they are (hopefully not MNARs!) and then pick one of the following strategies:
No matter what the decision is, it should be justified.
Allow these to guide your decisions on pivoting etc.
**Right, let’s do a little more investigation and then see what we can come up with. Start with the NAs.**
# Count the missing values in every column of each year's survey.
# `.cols = everything()` is given explicitly because calling across() without
# `.cols` is deprecated as of dplyr 1.1.0.
y2015_candy_data %>%
  summarise(across(.cols = everything(), .fns = ~ sum(is.na(.x))))
y2016_candy_data %>%
  summarise(across(.cols = everything(), .fns = ~ sum(is.na(.x))))
y2017_candy_data %>%
  summarise(across(.cols = everything(), .fns = ~ sum(is.na(.x))))
Right, so we’ve got a significant number of NAs. Since the first question tells us not to count any NAs, I’m guessing that we shouldn’t replace them with anything, but that they should still be in the data by the time we get to that question. With this in mind, I’m deciding to leave the NAs in for now.
Think I’m going to go for cleaning the column names now.
# Standardise the column names of each year's survey with janitor's
# clean_names(), then print each result to inspect it.
cc_y2015_candy_data <- y2015_candy_data %>%
  clean_names()
cc_y2015_candy_data
cc_y2016_candy_data <- y2016_candy_data %>%
  clean_names()
cc_y2016_candy_data
cc_y2017_candy_data <- y2017_candy_data %>%
  clean_names()
cc_y2017_candy_data
Cool, that’s looking a bit better now. I think the next thing to do is to work out which columns can be dropped from each of the datasets. The questions should definitely dictate this. I’m planning on doing this individually for each year before any join. I’m not sure if this is the right way to go about it, but it’s the way I’m feeling most comfortable with to be able to understand the process that is needed. I will also need to make sure that data hasn’t gone into the wrong column before deleting columns.
# Inspect the distinct answers in the country and state/province/county
# columns of the 2016 and 2017 surveys.
distinct(cc_y2016_candy_data, which_country_do_you_live_in)
distinct(cc_y2016_candy_data, which_state_province_county_do_you_live_in)
distinct(cc_y2017_candy_data, q4_country)
distinct(cc_y2017_candy_data, q5_state_province_county_etc)
Ok, it looks like we’ve not got any countries lingering in the county/state columns so I think we can add this to the list that we take out. This means, in my mind, that we can go ahead and take out the variables from each of the datasets that we don’t need. Anything that is a candy or gives us some sort of information on age, gender or country is what we need to hold onto. Also anything that might allow us to fill in the blanks we have in terms of these variables.
# List the cleaned column names for each year to decide which ones to keep.
names(cc_y2015_candy_data)
names(cc_y2016_candy_data)
names(cc_y2017_candy_data)
[1] "internal_id"
[2] "q1_going_out"
[3] "q2_gender"
[4] "q3_age"
[5] "q4_country"
[6] "q5_state_province_county_etc"
[7] "q6_100_grand_bar"
[8] "q6_anonymous_brown_globs_that_come_in_black_and_orange_wrappers_a_k_a_mary_janes"
[9] "q6_any_full_sized_candy_bar"
[10] "q6_black_jacks"
[11] "q6_bonkers_the_candy"
[12] "q6_bonkers_the_board_game"
[13] "q6_bottle_caps"
[14] "q6_boxo_raisins"
[15] "q6_broken_glow_stick"
[16] "q6_butterfinger"
[17] "q6_cadbury_creme_eggs"
[18] "q6_candy_corn"
[19] "q6_candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants"
[20] "q6_caramellos"
[21] "q6_cash_or_other_forms_of_legal_tender"
[22] "q6_chardonnay"
[23] "q6_chick_o_sticks_we_don_t_know_what_that_is"
[24] "q6_chiclets"
[25] "q6_coffee_crisp"
[26] "q6_creepy_religious_comics_chick_tracts"
[27] "q6_dental_paraphenalia"
[28] "q6_dots"
[29] "q6_dove_bars"
[30] "q6_fuzzy_peaches"
[31] "q6_generic_brand_acetaminophen"
[32] "q6_glow_sticks"
[33] "q6_goo_goo_clusters"
[34] "q6_good_n_plenty"
[35] "q6_gum_from_baseball_cards"
[36] "q6_gummy_bears_straight_up"
[37] "q6_hard_candy"
[38] "q6_healthy_fruit"
[39] "q6_heath_bar"
[40] "q6_hersheys_dark_chocolate"
[41] "q6_hershey_s_milk_chocolate"
[42] "q6_hersheys_kisses"
[43] "q6_hugs_actual_physical_hugs"
[44] "q6_jolly_rancher_bad_flavor"
[45] "q6_jolly_ranchers_good_flavor"
[46] "q6_joy_joy_mit_iodine"
[47] "q6_junior_mints"
[48] "q6_senior_mints"
[49] "q6_kale_smoothie"
[50] "q6_kinder_happy_hippo"
[51] "q6_kit_kat"
[52] "q6_laffy_taffy"
[53] "q6_lemon_heads"
[54] "q6_licorice_not_black"
[55] "q6_licorice_yes_black"
[56] "q6_lindt_truffle"
[57] "q6_lollipops"
[58] "q6_mars"
[59] "q6_maynards"
[60] "q6_mike_and_ike"
[61] "q6_milk_duds"
[62] "q6_milky_way"
[63] "q6_regular_m_ms"
[64] "q6_peanut_m_m_s"
[65] "q6_blue_m_ms"
[66] "q6_red_m_ms"
[67] "q6_green_party_m_ms"
[68] "q6_independent_m_ms"
[69] "q6_abstained_from_m_ming"
[70] "q6_minibags_of_chips"
[71] "q6_mint_kisses"
[72] "q6_mint_juleps"
[73] "q6_mr_goodbar"
[74] "q6_necco_wafers"
[75] "q6_nerds"
[76] "q6_nestle_crunch"
[77] "q6_nown_laters"
[78] "q6_peeps"
[79] "q6_pencils"
[80] "q6_pixy_stix"
[81] "q6_real_housewives_of_orange_county_season_9_blue_ray"
[82] "q6_reese_s_peanut_butter_cups"
[83] "q6_reeses_pieces"
[84] "q6_reggie_jackson_bar"
[85] "q6_rolos"
[86] "q6_sandwich_sized_bags_filled_with_boo_berry_crunch"
[87] "q6_skittles"
[88] "q6_smarties_american"
[89] "q6_smarties_commonwealth"
[90] "q6_snickers"
[91] "q6_sourpatch_kids_i_e_abominations_of_nature"
[92] "q6_spotted_dick"
[93] "q6_starburst"
[94] "q6_sweet_tarts"
[95] "q6_swedish_fish"
[96] "q6_sweetums_a_friend_to_diabetes"
[97] "q6_take_5"
[98] "q6_tic_tacs"
[99] "q6_those_odd_marshmallow_circus_peanut_things"
[100] "q6_three_musketeers"
[101] "q6_tolberone_something_or_other"
[102] "q6_trail_mix"
[103] "q6_twix"
[104] "q6_vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein"
[105] "q6_vicodin"
[106] "q6_whatchamacallit_bars"
[107] "q6_white_bread"
[108] "q6_whole_wheat_anything"
[109] "q6_york_peppermint_patties"
[110] "q7_joy_other"
[111] "q8_despair_other"
[112] "q9_other_comments"
[113] "q10_dress"
[114] "x114"
[115] "q11_day"
[116] "q12_media_daily_dish"
[117] "q12_media_science"
[118] "q12_media_espn"
[119] "q12_media_yahoo"
[120] "click_coordinates_x_y"
# Look at the distinct free-text remarks left by 2015 respondents.
distinct(
  cc_y2015_candy_data,
  please_leave_any_remarks_or_comments_regarding_your_choices
)
Right, so looking at the 2015 data, we’ve obviously not got any direct information about which country respondents are from. But we do have a few things that could indicate which country they are from. In particular, I’m thinking of the type of Smarties (American or Commonwealth = USA or Canada?) and even the degrees of separation questions (low on J.K. Rowling and Thom Yorke = UK?). I don’t think the same can be done for gender. I think we’ll go with these in terms of inclusion for now, plus any other comments.
# Count the rows of a data frame, optionally broken down by column values.
#
# The earlier version discarded its piped group_by()/count() result and always
# returned count(x), i.e. only the total number of rows. The columns to count
# by are now passed through `...`.
#
# x:   a data frame or tibble.
# ...: optional columns to count by, forwarded to dplyr::count(). With none
#      supplied, the result is the total row count, as before.
#
# Returns the table produced by dplyr::count(), with the tally in column `n`.
ds_group_count <- function(x, ...) {
  dplyr::count(x, ...)
}
# plan B: tally the answers to a single degrees-of-separation question.
# count() already groups by the column it is given, so the separate group_by()
# was redundant and left the result grouped.
cc_y2015_candy_data %>%
  count(please_estimate_the_degrees_of_separation_you_have_from_the_following_folks_malala_yousafzai)
# Columns are dropped by name rather than by position. It is more verbose, but
# if the column order ever differs we do not lose the information we need.
# Some of the degrees-of-separation questions duplicate celebrities, and some
# of those duplicates contain no values; the count above was written to check
# whether a column has values and, if so, how many.
chosen_cols_2015 <- select(
  cc_y2015_candy_data,
  -c(
    timestamp,
    please_leave_any_remarks_or_comments_regarding_your_choices:
      please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_francis_bacon_1561_1626,
    which_day_do_you_prefer_friday_or_sunday:
      please_estimate_the_degrees_of_separation_you_have_from_the_following_folks_beyonce_knowles
  )
)
chosen_cols_2015
NA
# List the cleaned 2016 column names before choosing which to drop.
names(cc_y2016_candy_data)
[1] "timestamp"
[2] "are_you_going_actually_going_trick_or_treating_yourself"
[3] "your_gender"
[4] "how_old_are_you"
[5] "which_country_do_you_live_in"
[6] "which_state_province_county_do_you_live_in"
[7] "x100_grand_bar"
[8] "anonymous_brown_globs_that_come_in_black_and_orange_wrappers"
[9] "any_full_sized_candy_bar"
[10] "black_jacks"
[11] "bonkers_the_candy"
[12] "bonkers_the_board_game"
[13] "bottle_caps"
[14] "boxo_raisins"
[15] "broken_glow_stick"
[16] "butterfinger"
[17] "cadbury_creme_eggs"
[18] "candy_corn"
[19] "candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants"
[20] "caramellos"
[21] "cash_or_other_forms_of_legal_tender"
[22] "chardonnay"
[23] "chick_o_sticks_we_don_t_know_what_that_is"
[24] "chiclets"
[25] "coffee_crisp"
[26] "creepy_religious_comics_chick_tracts"
[27] "dental_paraphenalia"
[28] "dots"
[29] "dove_bars"
[30] "fuzzy_peaches"
[31] "generic_brand_acetaminophen"
[32] "glow_sticks"
[33] "goo_goo_clusters"
[34] "good_n_plenty"
[35] "gum_from_baseball_cards"
[36] "gummy_bears_straight_up"
[37] "hard_candy"
[38] "healthy_fruit"
[39] "heath_bar"
[40] "hersheys_dark_chocolate"
[41] "hershey_s_milk_chocolate"
[42] "hersheys_kisses"
[43] "hugs_actual_physical_hugs"
[44] "jolly_rancher_bad_flavor"
[45] "jolly_ranchers_good_flavor"
[46] "joy_joy_mit_iodine"
[47] "junior_mints"
[48] "senior_mints"
[49] "kale_smoothie"
[50] "kinder_happy_hippo"
[51] "kit_kat"
[52] "laffy_taffy"
[53] "lemon_heads"
[54] "licorice_not_black"
[55] "licorice_yes_black"
[56] "lindt_truffle"
[57] "lollipops"
[58] "mars"
[59] "mary_janes"
[60] "maynards"
[61] "mike_and_ike"
[62] "milk_duds"
[63] "milky_way"
[64] "regular_m_ms"
[65] "peanut_m_m_s"
[66] "blue_m_ms"
[67] "red_m_ms"
[68] "third_party_m_ms"
[69] "minibags_of_chips"
[70] "mint_kisses"
[71] "mint_juleps"
[72] "mr_goodbar"
[73] "necco_wafers"
[74] "nerds"
[75] "nestle_crunch"
[76] "nown_laters"
[77] "peeps"
[78] "pencils"
[79] "person_of_interest_season_3_dvd_box_set_not_including_disc_4_with_hilarious_outtakes"
[80] "pixy_stix"
[81] "reese_s_peanut_butter_cups"
[82] "reeses_pieces"
[83] "reggie_jackson_bar"
[84] "rolos"
[85] "skittles"
[86] "smarties_american"
[87] "smarties_commonwealth"
[88] "snickers"
[89] "sourpatch_kids_i_e_abominations_of_nature"
[90] "spotted_dick"
[91] "starburst"
[92] "sweet_tarts"
[93] "swedish_fish"
[94] "sweetums_a_friend_to_diabetes"
[95] "tic_tacs"
[96] "those_odd_marshmallow_circus_peanut_things"
[97] "three_musketeers"
[98] "tolberone_something_or_other"
[99] "trail_mix"
[100] "twix"
[101] "vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein"
[102] "vicodin"
[103] "whatchamacallit_bars"
[104] "white_bread"
[105] "whole_wheat_anything"
[106] "york_peppermint_patties"
[107] "please_list_any_items_not_included_above_that_give_you_joy"
[108] "please_list_any_items_not_included_above_that_give_you_despair"
[109] "please_leave_any_witty_snarky_or_thoughtful_remarks_or_comments_regarding_your_choices"
[110] "guess_the_number_of_mints_in_my_hand"
[111] "betty_or_veronica"
[112] "that_dress_that_went_viral_a_few_years_back_when_i_first_saw_it_it_was"
[113] "what_is_your_favourite_font"
[114] "please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_jk_rowling"
[115] "please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_jj_abrams"
[116] "please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_beyonce"
[117] "please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_bieber"
[118] "please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_kevin_bacon"
[119] "please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_francis_bacon_1561_1626"
[120] "which_day_do_you_prefer_friday_or_sunday"
[121] "do_you_eat_apples_the_correct_way_east_to_west_side_to_side_or_do_you_eat_them_like_a_freak_of_nature_south_to_north_bottom_to_top"
[122] "when_you_see_the_above_image_of_the_4_different_websites_which_one_would_you_most_likely_check_out_please_be_honest"
[123] "york_peppermint_patties_ignore"
# Columns kept from the 2016 survey: drop the timestamp, the state/province
# field, and every free-text / non-candy question from the "joy" column
# through to the end of the sheet.
chosen_cols_2016 <-
  cc_y2016_candy_data %>%
  select(
    -timestamp,
    -which_state_province_county_do_you_live_in,
    -(please_list_any_items_not_included_above_that_give_you_joy:york_peppermint_patties_ignore)
  )
chosen_cols_2016
# List the cleaned 2017 column names.
names(cc_y2017_candy_data)
[1] "internal_id"
[2] "q1_going_out"
[3] "q2_gender"
[4] "q3_age"
[5] "q4_country"
[6] "q5_state_province_county_etc"
[7] "q6_100_grand_bar"
[8] "q6_anonymous_brown_globs_that_come_in_black_and_orange_wrappers_a_k_a_mary_janes"
[9] "q6_any_full_sized_candy_bar"
[10] "q6_black_jacks"
[11] "q6_bonkers_the_candy"
[12] "q6_bonkers_the_board_game"
[13] "q6_bottle_caps"
[14] "q6_boxo_raisins"
[15] "q6_broken_glow_stick"
[16] "q6_butterfinger"
[17] "q6_cadbury_creme_eggs"
[18] "q6_candy_corn"
[19] "q6_candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants"
[20] "q6_caramellos"
[21] "q6_cash_or_other_forms_of_legal_tender"
[22] "q6_chardonnay"
[23] "q6_chick_o_sticks_we_don_t_know_what_that_is"
[24] "q6_chiclets"
[25] "q6_coffee_crisp"
[26] "q6_creepy_religious_comics_chick_tracts"
[27] "q6_dental_paraphenalia"
[28] "q6_dots"
[29] "q6_dove_bars"
[30] "q6_fuzzy_peaches"
[31] "q6_generic_brand_acetaminophen"
[32] "q6_glow_sticks"
[33] "q6_goo_goo_clusters"
[34] "q6_good_n_plenty"
[35] "q6_gum_from_baseball_cards"
[36] "q6_gummy_bears_straight_up"
[37] "q6_hard_candy"
[38] "q6_healthy_fruit"
[39] "q6_heath_bar"
[40] "q6_hersheys_dark_chocolate"
[41] "q6_hershey_s_milk_chocolate"
[42] "q6_hersheys_kisses"
[43] "q6_hugs_actual_physical_hugs"
[44] "q6_jolly_rancher_bad_flavor"
[45] "q6_jolly_ranchers_good_flavor"
[46] "q6_joy_joy_mit_iodine"
[47] "q6_junior_mints"
[48] "q6_senior_mints"
[49] "q6_kale_smoothie"
[50] "q6_kinder_happy_hippo"
[51] "q6_kit_kat"
[52] "q6_laffy_taffy"
[53] "q6_lemon_heads"
[54] "q6_licorice_not_black"
[55] "q6_licorice_yes_black"
[56] "q6_lindt_truffle"
[57] "q6_lollipops"
[58] "q6_mars"
[59] "q6_maynards"
[60] "q6_mike_and_ike"
[61] "q6_milk_duds"
[62] "q6_milky_way"
[63] "q6_regular_m_ms"
[64] "q6_peanut_m_m_s"
[65] "q6_blue_m_ms"
[66] "q6_red_m_ms"
[67] "q6_green_party_m_ms"
[68] "q6_independent_m_ms"
[69] "q6_abstained_from_m_ming"
[70] "q6_minibags_of_chips"
[71] "q6_mint_kisses"
[72] "q6_mint_juleps"
[73] "q6_mr_goodbar"
[74] "q6_necco_wafers"
[75] "q6_nerds"
[76] "q6_nestle_crunch"
[77] "q6_nown_laters"
[78] "q6_peeps"
[79] "q6_pencils"
[80] "q6_pixy_stix"
[81] "q6_real_housewives_of_orange_county_season_9_blue_ray"
[82] "q6_reese_s_peanut_butter_cups"
[83] "q6_reeses_pieces"
[84] "q6_reggie_jackson_bar"
[85] "q6_rolos"
[86] "q6_sandwich_sized_bags_filled_with_boo_berry_crunch"
[87] "q6_skittles"
[88] "q6_smarties_american"
[89] "q6_smarties_commonwealth"
[90] "q6_snickers"
[91] "q6_sourpatch_kids_i_e_abominations_of_nature"
[92] "q6_spotted_dick"
[93] "q6_starburst"
[94] "q6_sweet_tarts"
[95] "q6_swedish_fish"
[96] "q6_sweetums_a_friend_to_diabetes"
[97] "q6_take_5"
[98] "q6_tic_tacs"
[99] "q6_those_odd_marshmallow_circus_peanut_things"
[100] "q6_three_musketeers"
[101] "q6_tolberone_something_or_other"
[102] "q6_trail_mix"
[103] "q6_twix"
[104] "q6_vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein"
[105] "q6_vicodin"
[106] "q6_whatchamacallit_bars"
[107] "q6_white_bread"
[108] "q6_whole_wheat_anything"
[109] "q6_york_peppermint_patties"
[110] "q7_joy_other"
[111] "q8_despair_other"
[112] "q9_other_comments"
[113] "q10_dress"
[114] "x114"
[115] "q11_day"
[116] "q12_media_daily_dish"
[117] "q12_media_science"
[118] "q12_media_espn"
[119] "q12_media_yahoo"
[120] "click_coordinates_x_y"
# Columns kept from the 2017 survey: drop the internal id, the
# state/province field, and everything from the free-text "joy" question
# through to the click coordinates.
chosen_cols_2017 <-
  cc_y2017_candy_data %>%
  select(
    -internal_id,
    -q5_state_province_county_etc,
    -(q7_joy_other:click_coordinates_x_y)
  )
chosen_cols_2017
# Compare the two differently-worded "degrees of separation from JK Rowling"
# columns in the 2015 data side by side.
cc_y2015_candy_data %>%
  select(
    please_estimate_the_degree_s_of_separation_you_have_from_the_following_celebrities_jk_rowling,
    please_estimate_the_degrees_of_separation_you_have_from_the_following_folks_jk_rowling
  )
# Tally the answers in the "folks" variant. count() groups by its argument
# itself, so a separate group_by() is not needed (and would leave the result
# grouped).
cc_y2015_candy_data %>%
  count(please_estimate_the_degrees_of_separation_you_have_from_the_following_folks_jk_rowling)
When we are cleaning the ages, we should check whether any numbers have been typed out as words. If not, we should just put in NAs for any values that are not in a realistic range.
Tomorrow, before the join, we rename the columns, or just match them up with those contained in the other datasets. We should also think about adding an extra year/id column for now, which will allow us to make sure everything is working.
# Exploratory natural join: with no `by`, dplyr joins on every column name the
# two tables share and reports that list in the "Joining, by = ..." message.
chosen_cols_2015 %>%
  full_join(chosen_cols_2016)
Joining, by = c("how_old_are_you", "are_you_going_actually_going_trick_or_treating_yourself", "butterfinger", "x100_grand_bar", "anonymous_brown_globs_that_come_in_black_and_orange_wrappers", "any_full_sized_candy_bar", "black_jacks", "bottle_caps", "cadbury_creme_eggs", "candy_corn", "vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein", "candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants", "cash_or_other_forms_of_legal_tender", "chiclets", "caramellos", "snickers", "dental_paraphenalia", "dots", "fuzzy_peaches", "generic_brand_acetaminophen", "glow_sticks", "broken_glow_stick", "goo_goo_clusters", "good_n_plenty", "gum_from_baseball_cards", "gummy_bears_straight_up", "creepy_religious_comics_chick_tracts", "healthy_fruit", "heath_bar", "hershey_s_milk_chocolate", "hugs_actual_physical_hugs", "jolly_rancher_bad_flavor", "jolly_ranchers_good_flavor", "kale_smoothie", "kinder_happy_hippo", "kit_kat", "hard_candy", "lemon_heads", "licorice_not_black", "lindt_truffle", "lollipops", "mars", "mary_janes", "maynards", "milk_duds", "laffy_taffy", "minibags_of_chips", "joy_joy_mit_iodine", "reggie_jackson_bar", "pixy_stix", "nerds", "nestle_crunch", "nown_laters", "pencils", "milky_way", "reese_s_peanut_butter_cups", "tolberone_something_or_other", "junior_mints", "senior_mints", "mint_kisses", "mint_juleps", "peanut_m_m_s", "regular_m_ms", "rolos", "skittles", "smarties_american", "smarties_commonwealth", "chick_o_sticks_we_don_t_know_what_that_is", "spotted_dick", "starburst", "swedish_fish", "those_odd_marshmallow_circus_peanut_things", "three_musketeers", "trail_mix", "twix", "vicodin", "white_bread", "whole_wheat_anything", "york_peppermint_patties", "necco_wafers")
# List the columns retained for 2015.
names(chosen_cols_2015)
[1] "how_old_are_you"
[2] "are_you_going_actually_going_trick_or_treating_yourself"
[3] "butterfinger"
[4] "x100_grand_bar"
[5] "anonymous_brown_globs_that_come_in_black_and_orange_wrappers"
[6] "any_full_sized_candy_bar"
[7] "black_jacks"
[8] "bonkers"
[9] "bottle_caps"
[10] "box_o_raisins"
[11] "brach_products_not_including_candy_corn"
[12] "bubble_gum"
[13] "cadbury_creme_eggs"
[14] "candy_corn"
[15] "vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein"
[16] "candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants"
[17] "cash_or_other_forms_of_legal_tender"
[18] "chiclets"
[19] "caramellos"
[20] "snickers"
[21] "dark_chocolate_hershey"
[22] "dental_paraphenalia"
[23] "dots"
[24] "fuzzy_peaches"
[25] "generic_brand_acetaminophen"
[26] "glow_sticks"
[27] "broken_glow_stick"
[28] "goo_goo_clusters"
[29] "good_n_plenty"
[30] "gum_from_baseball_cards"
[31] "gummy_bears_straight_up"
[32] "creepy_religious_comics_chick_tracts"
[33] "healthy_fruit"
[34] "heath_bar"
[35] "hershey_s_kissables"
[36] "hershey_s_milk_chocolate"
[37] "hugs_actual_physical_hugs"
[38] "jolly_rancher_bad_flavor"
[39] "jolly_ranchers_good_flavor"
[40] "kale_smoothie"
[41] "kinder_happy_hippo"
[42] "kit_kat"
[43] "hard_candy"
[44] "lapel_pins"
[45] "lemon_heads"
[46] "licorice"
[47] "licorice_not_black"
[48] "lindt_truffle"
[49] "lollipops"
[50] "mars"
[51] "mary_janes"
[52] "maynards"
[53] "milk_duds"
[54] "laffy_taffy"
[55] "minibags_of_chips"
[56] "joy_joy_mit_iodine"
[57] "reggie_jackson_bar"
[58] "pixy_stix"
[59] "nerds"
[60] "nestle_crunch"
[61] "nown_laters"
[62] "pencils"
[63] "milky_way"
[64] "reese_s_peanut_butter_cups"
[65] "tolberone_something_or_other"
[66] "runts"
[67] "junior_mints"
[68] "senior_mints"
[69] "mint_kisses"
[70] "mint_juleps"
[71] "mint_leaves"
[72] "peanut_m_m_s"
[73] "regular_m_ms"
[74] "mint_m_ms"
[75] "ribbon_candy"
[76] "rolos"
[77] "skittles"
[78] "smarties_american"
[79] "smarties_commonwealth"
[80] "chick_o_sticks_we_don_t_know_what_that_is"
[81] "spotted_dick"
[82] "starburst"
[83] "swedish_fish"
[84] "sweetums"
[85] "those_odd_marshmallow_circus_peanut_things"
[86] "three_musketeers"
[87] "peterson_brand_sidewalk_chalk"
[88] "peanut_butter_bars"
[89] "peanut_butter_jars"
[90] "trail_mix"
[91] "twix"
[92] "vicodin"
[93] "white_bread"
[94] "whole_wheat_anything"
[95] "york_peppermint_patties"
[96] "sea_salt_flavored_stuff_probably_chocolate_since_this_is_the_it_flavor_of_the_year"
[97] "necco_wafers"
# List the columns retained for 2016.
names(chosen_cols_2016)
[1] "are_you_going_actually_going_trick_or_treating_yourself"
[2] "your_gender"
[3] "how_old_are_you"
[4] "which_country_do_you_live_in"
[5] "x100_grand_bar"
[6] "anonymous_brown_globs_that_come_in_black_and_orange_wrappers"
[7] "any_full_sized_candy_bar"
[8] "black_jacks"
[9] "bonkers_the_candy"
[10] "bonkers_the_board_game"
[11] "bottle_caps"
[12] "boxo_raisins"
[13] "broken_glow_stick"
[14] "butterfinger"
[15] "cadbury_creme_eggs"
[16] "candy_corn"
[17] "candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants"
[18] "caramellos"
[19] "cash_or_other_forms_of_legal_tender"
[20] "chardonnay"
[21] "chick_o_sticks_we_don_t_know_what_that_is"
[22] "chiclets"
[23] "coffee_crisp"
[24] "creepy_religious_comics_chick_tracts"
[25] "dental_paraphenalia"
[26] "dots"
[27] "dove_bars"
[28] "fuzzy_peaches"
[29] "generic_brand_acetaminophen"
[30] "glow_sticks"
[31] "goo_goo_clusters"
[32] "good_n_plenty"
[33] "gum_from_baseball_cards"
[34] "gummy_bears_straight_up"
[35] "hard_candy"
[36] "healthy_fruit"
[37] "heath_bar"
[38] "hersheys_dark_chocolate"
[39] "hershey_s_milk_chocolate"
[40] "hersheys_kisses"
[41] "hugs_actual_physical_hugs"
[42] "jolly_rancher_bad_flavor"
[43] "jolly_ranchers_good_flavor"
[44] "joy_joy_mit_iodine"
[45] "junior_mints"
[46] "senior_mints"
[47] "kale_smoothie"
[48] "kinder_happy_hippo"
[49] "kit_kat"
[50] "laffy_taffy"
[51] "lemon_heads"
[52] "licorice_not_black"
[53] "licorice_yes_black"
[54] "lindt_truffle"
[55] "lollipops"
[56] "mars"
[57] "mary_janes"
[58] "maynards"
[59] "mike_and_ike"
[60] "milk_duds"
[61] "milky_way"
[62] "regular_m_ms"
[63] "peanut_m_m_s"
[64] "blue_m_ms"
[65] "red_m_ms"
[66] "third_party_m_ms"
[67] "minibags_of_chips"
[68] "mint_kisses"
[69] "mint_juleps"
[70] "mr_goodbar"
[71] "necco_wafers"
[72] "nerds"
[73] "nestle_crunch"
[74] "nown_laters"
[75] "peeps"
[76] "pencils"
[77] "person_of_interest_season_3_dvd_box_set_not_including_disc_4_with_hilarious_outtakes"
[78] "pixy_stix"
[79] "reese_s_peanut_butter_cups"
[80] "reeses_pieces"
[81] "reggie_jackson_bar"
[82] "rolos"
[83] "skittles"
[84] "smarties_american"
[85] "smarties_commonwealth"
[86] "snickers"
[87] "sourpatch_kids_i_e_abominations_of_nature"
[88] "spotted_dick"
[89] "starburst"
[90] "sweet_tarts"
[91] "swedish_fish"
[92] "sweetums_a_friend_to_diabetes"
[93] "tic_tacs"
[94] "those_odd_marshmallow_circus_peanut_things"
[95] "three_musketeers"
[96] "tolberone_something_or_other"
[97] "trail_mix"
[98] "twix"
[99] "vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein"
[100] "vicodin"
[101] "whatchamacallit_bars"
[102] "white_bread"
[103] "whole_wheat_anything"
[104] "york_peppermint_patties"
# Add an id and a survey year to each dataset. Rows with no value in any
# variable are removed first, so ids are numbered over the remaining rows only.
# (Author's note: this only appeared to affect the 2017 data.)
add_id_and_year <- function(survey, survey_year) {
  survey %>%
    filter_all(any_vars(!is.na(.))) %>%
    mutate(year = survey_year) %>%
    tibble::rowid_to_column("id")
}
cc_id_2015 <- add_id_and_year(chosen_cols_2015, 2015)
cc_id_2016 <- add_id_and_year(chosen_cols_2016, 2016)
cc_id_2017 <- add_id_and_year(chosen_cols_2017, 2017)
cc_id_2015
cc_id_2016
cc_id_2017
# Reshape 2015 to long form: one row per respondent per candy, with the candy
# column name in `candy_type` and the response in `rating`.
cc_pivot_2015 <- pivot_longer(
  cc_id_2015,
  cols = butterfinger:necco_wafers,
  names_to = "candy_type",
  values_to = "rating"
)
cc_pivot_2015
# Reshape 2016 to long form: one row per respondent per candy, with the candy
# column name in `candy_type` and the response in `rating`.
cc_pivot_2016 <- pivot_longer(
  cc_id_2016,
  cols = x100_grand_bar:york_peppermint_patties,
  names_to = "candy_type",
  values_to = "rating"
)
cc_pivot_2016
# Reshape 2017 to long form: one row per respondent per candy, with the candy
# column name in `candy_type` and the response in `rating`.
cc_pivot_2017 <-
  cc_id_2017 %>%
  pivot_longer(
    cols = q6_100_grand_bar:q6_york_peppermint_patties,
    names_to = "candy_type",
    values_to = "rating"
  ) %>%
  mutate(
    # Strip the 2017-only question prefix; anchored so only a leading "q6_"
    # is ever removed.
    candy_type = str_remove(candy_type, "^q6_"),
    # The 2015/2016 column is "x100_grand_bar" (the name-cleaning step put an
    # "x" in front of the leading digit), whereas stripping "q6_" here leaves
    # "100_grand_bar". Align the label so the same candy is not split across
    # two names once the years are stacked. Other differently-worded candy
    # names are deliberately left as they are.
    candy_type = if_else(
      candy_type == "100_grand_bar",
      "x100_grand_bar",
      candy_type
    )
  )
cc_pivot_2017
# Give the 2015 demographic columns the short names shared by all years.
# (2015 has no gender or country column in the retained set.)
cc_pivot_2015_rename <- rename(
  cc_pivot_2015,
  going_out = are_you_going_actually_going_trick_or_treating_yourself,
  age = how_old_are_you
)
cc_pivot_2015_rename
# Give the 2016 demographic columns the short names shared by all years.
cc_pivot_2016_rename <- rename(
  cc_pivot_2016,
  age = how_old_are_you,
  gender = your_gender,
  country = which_country_do_you_live_in,
  going_out = are_you_going_actually_going_trick_or_treating_yourself
)
cc_pivot_2016_rename
# Give the 2017 demographic columns the short names shared by all years.
cc_pivot_2017_rename <- rename(
  cc_pivot_2017,
  age = q3_age,
  gender = q2_gender,
  country = q4_country,
  going_out = q1_going_out
)
cc_pivot_2017_rename
# Stack the three long-format years into a single table; columns missing from
# a given year are filled with NA by bind_rows().
joined_candy <-
  cc_pivot_2015_rename %>%
  bind_rows(cc_pivot_2016_rename, cc_pivot_2017_rename)
joined_candy
Right, now that we’ve got the join done, let’s go through each of the columns and make sure that everything is clean (enough!)
# Which gender values are present after the join?
distinct(joined_candy, gender)
# Age-cleaning experiments. The aim is to keep integer ages, turn decimal ages
# into integers, and blank out anything outside a realistic age range.
# Experiment 1: let readr guess the column type.
pars_exper <- mutate(joined_candy, age = parse_guess(age))
distinct(pars_exper, age)
# Experiment 2: coerce straight to integer (unparseable entries become NA).
integer_exper <- mutate(joined_candy, age = as.integer(age))
distinct(integer_exper, age)
# Experiment 3: on top of experiment 2, set ages below 4 or above 120 to NA.
finish_exper <- mutate(
  integer_exper,
  age = replace(age, age < 4 | age > 120, NA)
)
# Clean the age column: keep whole-number ages, convert decimal ages to the
# closest integer, and set anything outside a realistic range (4 to 120) to NA.
# Entries that cannot be read as a number become NA (R warns about the
# coercion).
jc_age_done <-
  joined_candy %>%
  mutate(
    # Go through numeric + round(): as.integer() on its own would drop the
    # fractional part (45.9 -> 45) instead of giving the closest integer.
    age = round(as.numeric(age)),
    # Range-check while still numeric so huge or infinite values are blanked
    # before the final integer conversion.
    age = as.integer(replace(age, age < 4 | age > 120, NA))
  )
# Check which ages remain after cleaning.
distinct(jc_age_done, age)
# Number of rows for each `going_out` answer.
jc_age_done %>%
  count(going_out, name = "count")
# A couple of candies are repeated in different forms. They are kept as they
# are for now; the analysis may offer alternative answers, e.g. if
# "anonymous_brown_globs" is treated as mary_janes (as suggested) the scores
# could be amalgamated. That would need a guard against double counting, i.e.
# id 3 from 2015 cannot register two votes under both names for mary janes.
jc_age_done %>%
  distinct(candy_type) %>%
  arrange(candy_type)
# This is the biggie: the countries. First, a quick look at the gender values.
distinct(jc_age_done, gender)
# Flag rows whose country entry contains a run of two or more digits; these
# look like ages typed into the country field. NA countries give an NA flag.
num_for_country <- mutate(
  jc_age_done,
  country_num_detect = str_detect(country, "[0-9]+[0-9]")
)
num_for_country
# Move ages that were typed into the country field across to `age`.
# The first attempt, if_else(country_num_detect == TRUE, country, age), failed
# with "`false` must be a character vector, not an integer vector": `country`
# is character, `age` is integer, and if_else() requires both branches to have
# the same type. The digits are therefore extracted and converted to integer
# before being used.
num_changed_for_country <-
  num_for_country %>%
  mutate(
    age = if_else(
      country_num_detect,
      # Same pattern as the detection step, so the extracted run is the one
      # that triggered the flag.
      as.integer(str_extract(country, "[0-9]+[0-9]")),
      age,
      # The flag is NA wherever `country` is NA; keep the existing age there
      # rather than letting if_else() return NA.
      missing = age
    )
  )
num_changed_for_country %>%
  distinct(age)
# Fallback: list the respondents whose country entry contains a number so the
# misplaced ages can be hard-coded by hand.
# The table is in long form (one row per respondent per candy), so distinct()
# collapses the listing to one row per respondent instead of repeating each
# respondent once per candy.
num_for_country_hard_code <-
  jc_age_done %>%
  filter(str_detect(country, "[0-9]+[0-9]")) %>%
  distinct(id, year, country)
num_for_country_hard_code
# Hard-code the ages that respondents typed into the country field, keyed by
# survey year and per-year id. All other rows keep their cleaned age.
jc_age_extras <-
  jc_age_done %>%
  mutate(
    age = case_when(
      year == 2016 & id == 117 ~ 51,
      year == 2016 & id == 303 ~ 47,
      year == 2016 & id == 622 ~ 54,
      year == 2016 & id == 623 ~ 54,
      year == 2016 & id == 692 ~ 44,
      year == 2016 & id == 829 ~ 45,
      year == 2016 & id == 1101 ~ 30,
      year == 2017 & id == 186 ~ 35,
      year == 2017 & id == 558 ~ 46,
      year == 2017 & id == 728 ~ 45,
      # The replacement values are doubles, so the column ends up double.
      TRUE ~ as.numeric(age)
    )
  )
jc_age_extras
distinct(jc_age_extras, gender)
# Recode the free-text country field into four buckets ("USA", "Canada", "UK",
# "Other country"). Anything not listed below, including numeric and nonsense
# entries and missing values, becomes NA. Matching is exact (case- and
# whitespace-sensitive), which is why so many spellings are enumerated.
usa_spellings <- c(
  "usa", "US", "United States of America", "uSA", "united states",
  "United States", "us", "USSA", "U.S.A", "Murica", "USA!", "Usa", "U.S.",
  "Us", "Units States", "United states", "USA USA USA", "the best one - usa",
  "USA! USA! USA!",
  "Cascadia", "u.s.", "The Yoo Ess of Aaayyyyyy", "united states of america",
  "USA!!!!!!", "USA! USA!", "United Sates",
  "Sub-Canadian North America... 'Merica", "Trumpistan", "U.s.", "Merica",
  "UNited States", "United Stetes", "america", "The republic of Cascadia",
  "USA USA USA USA", "United States of America", "United State",
  "United staes", "u.s.a.", "USAUSAUSA", "US of A", "Unites States",
  "The United States", "Unied States", "U S", "The United States of America",
  "unite states", "cascadia", "USA? Hard to tell anymore..", "‘merica", "usas",
  "Pittsburgh", "New York", "California", "USa",
  "I pretend to be from Canada, but I am really from the United States.",
  "United Stated", "Ahem….Amerca", "New Jersey", "United ststes",
  "United Statss", "murrika", "USAA", "Alaska", "united States", "u s a",
  "United Statea", "united ststes", "USA USA USA!!!!", "U.S.A.",
  "USA (I think but it's an election year so who can really tell)", "America",
  "United States of America", "'merica", "Ahem....Amerca", "North Carolina",
  "United States of America "
)
canada_spellings <- c("canada", "Canada`", "Can", "CANADA")
uk_spellings <- c(
  "uk", "United Kingdom", "England", "england", "United Kindom", "U.K.", "Uk",
  "Scotland", "United kingdom"
)
other_countries <- c(
  "Japan", "france", "A tropical island south of the equator", "Switzerland",
  "Korea", "belgium", "croatia", "Portugal", "españa", "Panama", "France",
  "Australia", "hungary", "Austria", "New Zealand", "Germany", "Mexico",
  "Brasil", "South Korea", "Philippines", "sweden", "The Netherlands",
  "Finland", "China", "germany", "kenya", "Netherlands", "netherlands", "UAE",
  "finland", "Europe", "Costa Rica", "Greece", "australia", "Canae", "Ireland",
  "South africa", "Iceland", "Denmark", "Indonesia", "Singapore", "Taiwan",
  "hong kong", "spain", "Sweden", "Hong Kong"
)
clean_candy_complete <-
  jc_age_extras %>%
  mutate(
    country = case_when(
      # Entries already equal to a bucket label are kept under that label.
      country %in% c("USA", usa_spellings) ~ "USA",
      country %in% c("Canada", canada_spellings) ~ "Canada",
      country %in% c("UK", uk_spellings) ~ "UK",
      country %in% c("Other country", other_countries) ~ "Other country",
      TRUE ~ NA_character_
    )
  )
distinct(clean_candy_complete, gender)
# Final tidy-up: order rows by survey year then respondent id, keep the output
# columns in their final order, and write the cleaned data to disk.
clean_candy_complete <-
  clean_candy_complete %>%
  arrange(year, id) %>%
  select(id, year, country, age, gender, going_out, candy_type, rating)
clean_candy_complete
distinct(clean_candy_complete, gender)
write_csv(
  clean_candy_complete,
  here::here("clean_data/clean_candy_script.csv")
)